# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import nltk
# read csv
# Load the philosophy-sentences dataset (one row per sentence, with author,
# school, raw/lowered/tokenized text and sentence length).
# NOTE(review): hard-coded absolute path — this only runs on the original
# author's machine; consider a path relative to the project root.
df = pd.read_csv('/Users/wanghan/Documents/ads-spring2023-project1-wangyeye66/data/philosophy_data.csv')
# Preview the first three rows (notebook display).
df.head(3)
| title | author | school | sentence_spacy | sentence_str | original_publication_date | corpus_edition_date | sentence_length | sentence_lowered | tokenized_txt | lemmatized_str | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Plato - Complete Works | Plato | plato | What's new, Socrates, to make you leave your ... | What's new, Socrates, to make you leave your ... | -350 | 1997 | 125 | what's new, socrates, to make you leave your ... | ['what', 'new', 'socrates', 'to', 'make', 'you... | what be new , Socrates , to make -PRON- lea... |
| 1 | Plato - Complete Works | Plato | plato | Surely you are not prosecuting anyone before t... | Surely you are not prosecuting anyone before t... | -350 | 1997 | 69 | surely you are not prosecuting anyone before t... | ['surely', 'you', 'are', 'not', 'prosecuting',... | surely -PRON- be not prosecute anyone before ... |
| 2 | Plato - Complete Works | Plato | plato | The Athenians do not call this a prosecution b... | The Athenians do not call this a prosecution b... | -350 | 1997 | 74 | the athenians do not call this a prosecution b... | ['the', 'athenians', 'do', 'not', 'call', 'thi... | the Athenians do not call this a prosecution ... |
To explore the data, we pick the three most important features: title, author and school. Plotting their distributions clearly shows that Aristotle, Plato and Hegel are the top three authors in our dataset. That makes sense, since they are so well known. As for schools, the top three in our dataset are analytic, aristotle, and german idealism.
# categorical data distributions: one bar chart of value counts per column
features = ['title', 'author', 'school']
for feature in features:
    plt.figure(figsize=(16, 10), dpi=80)
    df[feature].value_counts().plot(kind='bar')
    plt.title(feature)
    plt.grid()
    plt.show()
Now, let's focus on the numerical data. Sentence length is an important feature in this dataset. The median sentence length is 127 words, and the longest sentence is 2,649 words! It seems philosophers love long sentences. It is also interesting that sentence length is approximately normally distributed on a log scale, which is what I expected.
# show distributions of numerical data before moving forward
print(df.sentence_length.describe())
# FIX: sns.displot() is figure-level — it always creates its own FacetGrid
# figure, so a preceding plt.figure() only leaves an empty figure behind
# (the stray "<Figure size 1200x500 with 0 Axes>" outputs). Size the plot
# through displot's height/aspect instead (5in tall x ~12in wide).
sns.displot(df['sentence_length'], kde=True, height=5, aspect=2.4)
plt.title('Sentence Length Distribution')
plt.xlabel('Sentence Length')
plt.ylabel('Counts')
# plt.savefig('sentence_length_dist.png', format='png', dpi=80, bbox_inches='tight')
# looks like a normal distribution on a log scale
sns.displot(df['sentence_length'], log_scale=True, kde=True, bins=50, height=5, aspect=2.4)
plt.title('Sentence Length Distribution with log-scale')
plt.xlabel('Sentence Length')
plt.ylabel('Counts')
# plt.savefig('sentence_length_dist_log.png', format='png', dpi=80, bbox_inches='tight')
count 360808.000000 mean 150.790964 std 104.822072 min 20.000000 25% 75.000000 50% 127.000000 75% 199.000000 max 2649.000000 Name: sentence_length, dtype: float64
Text(-8.805555555555555, 0.5, 'Counts')
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
The distribution of corpus edition dates is clustered around the years 1990 to 2010.
# corpus edition date cluster around year 1990~2010
# Figure-level histogram + KDE of the corpus edition years (100 bins).
sns.displot(df['corpus_edition_date'],kde = True, bins = 100)
<seaborn.axisgrid.FacetGrid at 0x7ff6587b80d0>
To answer this, I averaged the sentence length for each author, ranked the averages from highest to lowest, and displayed them in a horizontal bar plot. The figure clearly shows that Descartes is the most verbose author among all philosophers in our dataset.
# sentence length comparison over authors: mean length per author,
# ranked from most to least verbose, shown as a horizontal bar plot.
avg_len = df.groupby('author')['sentence_length'].mean()
avg_len_df = (avg_len
              .rename('length')
              .reset_index()
              .sort_values('length', ascending=False))
avg_len_df
plt.figure(figsize=(15, 10), dpi=80)
ax = sns.barplot(data=avg_len_df, x='length', y='author', palette='Set2')
ax.set_title('Author and their average sentence length', fontsize=15)
ax.set_ylabel('Author Name', fontsize=15)
ax.set_xlabel('Length', fontsize=15)
Text(0.5, 0, 'Length')
Then, I plot a figure showing the sentence length comparison across schools. However, there is no apparent relationship between sentence length and school.
# sentence length by schools, compared with one box plot per school
plt.figure(figsize=(20, 15))
sns.boxplot(data=df, y='school', x='sentence_length', palette='Set2')
plt.title('Sentence Length by schools', fontsize = 20)
plt.xlabel('Sentence Length',fontsize=15)
plt.ylabel('Schools',fontsize=15)
Text(0, 0.5, 'Schools')
The following figures are word clouds for each school. For example, for the school of Plato the most frequent words are: one, thing, will, god, soul... We can compare the word clouds across the different schools.
from wordcloud import WordCloud, STOPWORDS

schools = df.school.unique()
# Render one word cloud per school, built from all of its lower-cased
# sentences with the standard stop words removed.
t1 = time.time()
for school in schools:
    subset = df[df.school == school]
    print('School = ', school.upper(), ':')
    text = " ".join(txt for txt in subset.sentence_lowered)
    cloud = WordCloud(width = 800, height = 800,
                      max_font_size = 50,
                      max_words= 500,
                      background_color='white',
                      stopwords = STOPWORDS).generate(text)
    plt.figure(figsize=(15, 15))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
t2 = time.time()
print('Elapsed time: ', np.round(t2-t1, 2))
School = PLATO :
School = ARISTOTLE :
School = EMPIRICISM :
School = RATIONALISM :
School = ANALYTIC :
School = CONTINENTAL :
School = PHENOMENOLOGY :
School = GERMAN_IDEALISM :
School = COMMUNISM :
School = CAPITALISM :
School = STOICISM :
School = NIETZSCHE :
School = FEMINISM :
Elapsed time: 36.87
Finally, I vectorized each word in the sentences using H2OWord2vecEstimator. After training the model, we can feed in a word and let the model select synonyms. The 'find_synonyms' method returns the selected words and how strongly each one correlates with the input word.
# to build a model that finding synonyms based on what philosophers said
import h2o
from h2o.estimators import H2OWord2vecEstimator, H2OGradientBoostingEstimator
# Connect to (or start) a local H2O cluster; all model training below
# runs inside that JVM process, not in this Python process.
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321 . connected. Warning: Your H2O cluster version is too old (7 months and 21 days)!Please download and install the latest version from http://h2o.ai/download/
| H2O_cluster_uptime: | 1 day 19 hours 26 mins |
| H2O_cluster_timezone: | Asia/Shanghai |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.38.0.4 |
| H2O_cluster_version_age: | 7 months and 21 days !!! |
| H2O_cluster_name: | H2O_from_python_wanghan_qzy4ws |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 1.060 Gb |
| H2O_cluster_total_cores: | 8 |
| H2O_cluster_allowed_cores: | 8 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.9.13 final |
# create a dataframe
# Push the school label and the lower-cased sentence text into an H2O frame.
text_h2o = h2o.H2OFrame(df[['school', 'sentence_lowered']])

def tokenize(sentences, stop_word = STOPWORDS):
    """Split an H2O string column into cleaned word tokens.

    Tokenizes on runs of non-word characters, lower-cases, then drops
    single-character tokens, tokens containing digits, and stop words.
    NA rows are deliberately kept at every filtering step.
    NOTE(review): the kept NAs presumably mark sentence boundaries for
    H2O's word2vec trainer — confirm against the H2O tokenize docs.
    """
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # keep tokens of length >= 2 (and the NA rows)
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # drop tokens that contain digits
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # drop stop words, again preserving the NA rows
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word)),:]
    return tokenized_words
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
words = tokenize(text_h2o['sentence_lowered'])
# train Word2Vec model
import random
# NOTE(review): random.seed() only seeds Python's RNG — H2O trains in the
# JVM, so this call does not make the word2vec run reproducible.
random.seed(2023)
t1 = time.time()
# transform word to a matrix of vectors
# 100-dimensional embeddings, context window of 5, 20 training epochs.
w2v_model = H2OWord2vecEstimator(vec_size = 100,
window_size = 5,
sent_sample_rate = 0.1,
init_learning_rate = 0.025,
epochs = 20)
w2v_model.train(training_frame=words)
t2 = time.time()
print('Elapsed time:', np.round(t2-t1,2), 'secs')
word2vec Model Build progress: |█████████████████████████████████████████████████| (done) 100% Elapsed time: 321.25 secs
# find synonyms of word 'time'
# Returns the 5 most similar words with their similarity scores.
w2v_model.find_synonyms('time', count =5)
OrderedDict([('elapsed', 0.708873450756073),
('duration', 0.6320406198501587),
('moment', 0.6101813912391663),
('periods', 0.6062440276145935),
('period', 0.6019760370254517)])
w2v_model.find_synonyms('food', count =5)
OrderedDict([('drink', 0.7303839921951294),
('meat', 0.6859197616577148),
('nourishment', 0.6842240691184998),
('clothing', 0.6706805229187012),
('feed', 0.6662881970405579)])
# change sentence to vector
# One 100-dim vector per sentence: the average of its word vectors.
sentence_vec = w2v_model.transform(words, aggregate_method= 'AVERAGE')
# combine 'school' to the dataframe
sentence_vec = sentence_vec.cbind(text_h2o['school'])
sentence_vec.head(5)
| C1 | C2 | C3 | C4 | C5 | C6 | C7 | C8 | C9 | C10 | C11 | C12 | C13 | C14 | C15 | C16 | C17 | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | C27 | C28 | C29 | C30 | C31 | C32 | C33 | C34 | C35 | C36 | C37 | C38 | C39 | C40 | C41 | C42 | C43 | C44 | C45 | C46 | C47 | C48 | C49 | C50 | C51 | C52 | C53 | C54 | C55 | C56 | C57 | C58 | C59 | C60 | C61 | C62 | C63 | C64 | C65 | C66 | C67 | C68 | C69 | C70 | C71 | C72 | C73 | C74 | C75 | C76 | C77 | C78 | C79 | C80 | C81 | C82 | C83 | C84 | C85 | C86 | C87 | C88 | C89 | C90 | C91 | C92 | C93 | C94 | C95 | C96 | C97 | C98 | C99 | C100 | school |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -0.0270275 | -0.115236 | 0.192338 | 0.00423211 | 0.224575 | 0.115317 | 0.0897102 | 0.257108 | 0.11771 | 0.0437045 | -0.0395946 | -0.0104013 | 0.00076486 | -0.151811 | -0.075053 | 0.127939 | -0.148506 | 0.074448 | -0.00258222 | 0.146996 | 0.206723 | -0.0977651 | 0.042126 | -0.129466 | 0.136665 | 0.0513599 | 0.149747 | -0.0510237 | 0.0457752 | 0.139441 | 0.072057 | 0.174318 | 0.220235 | 0.136412 | 0.0549784 | 0.0838653 | -0.0728306 | -0.359166 | 0.010093 | -0.200572 | 0.0401647 | 0.148911 | 0.0819605 | 0.122058 | -0.117315 | 0.272924 | 0.0285401 | -0.135643 | -0.236269 | -0.293263 | 0.0593319 | -0.0391214 | 0.103596 | -0.189723 | 0.118731 | 0.057319 | 0.132578 | 0.157938 | -0.129218 | -0.0171481 | -0.118238 | -0.155995 | 0.296587 | -0.102452 | 0.0734612 | -0.0417783 | 0.00383338 | 0.0542067 | -0.152683 | -0.131286 | 0.0515924 | -0.0431747 | -0.00885805 | -0.252557 | 0.022155 | -0.0431722 | 0.22899 | -0.00453793 | 0.0412523 | -0.0302709 | -0.00283074 | 0.0186849 | -0.0399997 | -0.0631002 | 0.00747089 | 0.0443814 | 0.0579868 | 0.11093 | -0.058469 | 0.0582504 | 0.0629307 | -0.0432447 | 0.259089 | -0.19052 | -0.0597855 | 0.0555069 | -0.104113 | 0.0751708 | -0.0538824 | 0.0864369 | plato |
| -0.0322588 | -0.0870689 | 0.381064 | -0.140936 | 0.127958 | -0.0235513 | -0.0875878 | 0.215506 | 0.094162 | -0.264188 | -0.259692 | 0.0334019 | -0.113537 | -0.256103 | 0.0507584 | 0.0583892 | -0.0784186 | 0.120244 | -0.0260713 | 0.190949 | 0.380847 | -0.254405 | -0.0944675 | -0.182599 | 0.35621 | 0.0207392 | -0.0494949 | -0.204913 | 0.0187355 | 0.0644497 | 0.0538392 | 0.0958037 | 0.343886 | 0.215072 | 0.13738 | -0.0827233 | -0.323844 | -0.485399 | 0.0567551 | -0.219521 | 0.0833379 | 0.216616 | 0.0478998 | 0.209078 | -0.0363484 | 0.304463 | -0.0959812 | -0.0189733 | -0.230038 | -0.248723 | 0.0797265 | -0.0133533 | 0.264996 | -0.471248 | -0.0166321 | 0.0934934 | 0.231371 | 0.105037 | 0.088899 | -0.0946622 | -0.311939 | -0.156797 | 0.443288 | -0.379075 | -0.138669 | 0.0834882 | 0.108329 | 0.0327758 | -0.251777 | -0.0184545 | 0.0521639 | 0.0317343 | -0.301366 | -0.0486541 | 0.139182 | -0.00918948 | 0.189362 | 0.172462 | 0.117473 | -0.104104 | -0.0906448 | 0.0645012 | 0.0823389 | -0.00702505 | 0.0862936 | -0.119427 | 0.113979 | 0.10673 | 0.00179043 | -0.23442 | -0.111265 | -0.10381 | 0.268018 | -0.360795 | -0.0220825 | -0.0921651 | -0.182993 | -0.0215212 | 0.0541634 | 0.160532 | plato |
| -0.267391 | -0.070267 | 0.211407 | -0.0274868 | 0.336383 | -0.145132 | -0.20662 | 0.217596 | 0.089053 | -0.123701 | -0.245469 | -0.0740381 | -0.127687 | -0.0486155 | 0.184035 | -0.025948 | 0.11739 | -0.186106 | -0.292118 | 0.32003 | 0.271142 | -0.197925 | -0.239352 | 0.110026 | 0.0713032 | 0.169303 | -0.0223043 | -0.151276 | -0.0489049 | 0.0714303 | 0.0590714 | 0.231868 | 0.60257 | 0.416686 | -0.166118 | 0.321133 | -0.0099782 | -0.403663 | 0.0276375 | -0.174769 | 0.386128 | 0.0528172 | -0.0852312 | -0.00123767 | 0.183435 | 0.0785223 | -0.0131175 | -0.0264518 | -0.243594 | -0.175619 | 0.0821882 | -0.104072 | 0.32174 | -0.278318 | -0.105993 | 0.246244 | -0.0465767 | -0.226451 | 0.0618066 | -0.0251768 | -0.233715 | -0.289349 | 0.128921 | -0.412904 | -0.12997 | 0.00977698 | 0.0666957 | -0.13437 | -0.242807 | -0.0348449 | -0.0314411 | 0.107841 | -0.179068 | -0.148319 | 0.321547 | -0.187468 | 0.296351 | 0.148052 | 0.125854 | -0.0445568 | 0.108352 | 0.115356 | -0.173842 | -0.0380914 | 0.162994 | -0.00714817 | 0.093203 | 0.178699 | -0.0588869 | -0.0785141 | 0.0235059 | 0.105609 | 0.336313 | -0.448283 | -0.14429 | -0.0126349 | -0.375603 | -0.290007 | 0.00525011 | 0.0662187 | plato |
| -0.197838 | -0.0591242 | 0.277203 | 0.00106654 | 0.268289 | 0.141518 | -0.231243 | 0.318818 | 0.0066498 | -0.0669985 | -0.152388 | -0.167481 | -0.0226504 | 0.0355511 | -0.0473778 | -0.091728 | -0.113426 | 0.182329 | -0.140146 | -0.0429076 | -0.00240017 | -0.246328 | -0.0550874 | -0.00286175 | -0.00586401 | 0.0984587 | 0.00378198 | 0.0398661 | -0.0307649 | 0.16862 | -0.0964049 | 0.206545 | 0.198868 | 0.134969 | 0.259987 | 0.175453 | -0.062109 | -0.319505 | 0.14015 | 0.106599 | 0.412522 | 0.0485188 | 0.0482027 | 0.145662 | -0.188829 | 0.0983437 | -0.147681 | 0.0133 | -0.252817 | -0.079927 | 0.118723 | -0.0961553 | -0.0306009 | -0.203844 | -0.0347064 | 0.0387791 | 0.148103 | 0.0980942 | -0.260181 | -0.0813076 | -0.0549736 | -0.182983 | 0.0699712 | -0.276207 | -0.0460942 | 0.0744345 | -0.0604659 | 0.215949 | -0.123861 | -0.0349583 | 0.00731728 | 0.0539974 | 0.210896 | -0.157518 | 0.270154 | 0.149254 | 0.114987 | 0.168995 | -0.198532 | -0.349102 | 0.0146994 | 0.247339 | 0.151511 | -0.160515 | 0.0633216 | -0.0410114 | -0.110951 | 0.121368 | -0.0645883 | -0.0932481 | 0.0566011 | -0.11188 | 0.0396412 | -0.136326 | -0.0409276 | 0.0355243 | -0.07305 | 0.019929 | 0.280654 | -0.00741117 | plato |
| -0.00616367 | 0.119604 | 0.419397 | -0.055062 | 0.0549442 | 0.235722 | -0.163807 | 0.2245 | -0.135538 | -0.152318 | -0.39758 | -0.1857 | -0.193875 | 0.0125409 | -0.175968 | -0.0562862 | -0.268956 | 0.147765 | -0.158064 | -0.0636452 | 0.29723 | -0.199355 | 0.0638072 | -0.0377491 | 0.225663 | 0.099901 | 0.139706 | 0.0273046 | 0.003682 | 0.179475 | -0.0840987 | 0.126168 | 0.23357 | 0.35134 | 0.238219 | 0.0421745 | -0.140279 | -0.598417 | 0.0765858 | 0.0946514 | 0.337894 | 0.0762849 | -0.00913627 | -0.0481213 | -0.113573 | 0.0623061 | -0.261304 | 0.0028498 | -0.305145 | 0.0292923 | 0.00657962 | -0.0836349 | 0.165022 | -0.318534 | 0.0762372 | 0.165322 | 0.0728965 | 0.136026 | -0.0790542 | -0.102288 | 0.0783598 | -0.225731 | 0.274202 | -0.109607 | -0.104341 | 0.122111 | -0.0919462 | 0.179079 | -0.197445 | -0.168055 | 0.00663132 | 0.0451005 | 0.125427 | 0.0193237 | 0.327569 | 0.118589 | 0.114187 | 0.0412518 | 0.0837393 | -0.0725288 | 0.0385912 | 0.0857326 | 0.151191 | -0.2917 | 0.136008 | 0.0560022 | -0.134529 | 0.125555 | 0.101725 | -0.115702 | 0.0498814 | -0.102309 | 0.186173 | -0.208006 | -0.142885 | -0.150585 | -0.24508 | -0.196094 | 0.155825 | 0.348381 | plato |
[5 rows x 101 columns]
# Pull the sentence vectors back into pandas and drop any row containing
# NAs (sentences with no surviving tokens after filtering get no vector);
# this removes 360808 - 360282 = 526 rows.
df_sentence_vec = sentence_vec.as_data_frame()
df_sentence_vec = df_sentence_vec.dropna(axis= 0)
df_sentence_vec
| C1 | C2 | C3 | C4 | C5 | C6 | C7 | C8 | C9 | C10 | ... | C92 | C93 | C94 | C95 | C96 | C97 | C98 | C99 | C100 | school | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.027028 | -0.115236 | 0.192338 | 0.004232 | 0.224575 | 0.115317 | 0.089710 | 0.257108 | 0.117710 | 0.043704 | ... | -0.043245 | 0.259089 | -0.190520 | -0.059785 | 0.055507 | -0.104113 | 0.075171 | -0.053882 | 0.086437 | plato |
| 1 | -0.032259 | -0.087069 | 0.381064 | -0.140936 | 0.127958 | -0.023551 | -0.087588 | 0.215506 | 0.094162 | -0.264188 | ... | -0.103810 | 0.268018 | -0.360795 | -0.022082 | -0.092165 | -0.182993 | -0.021521 | 0.054163 | 0.160532 | plato |
| 2 | -0.267391 | -0.070267 | 0.211407 | -0.027487 | 0.336383 | -0.145132 | -0.206620 | 0.217596 | 0.089053 | -0.123701 | ... | 0.105609 | 0.336313 | -0.448283 | -0.144290 | -0.012635 | -0.375603 | -0.290007 | 0.005250 | 0.066219 | plato |
| 3 | -0.197838 | -0.059124 | 0.277203 | 0.001067 | 0.268289 | 0.141518 | -0.231243 | 0.318818 | 0.006650 | -0.066998 | ... | -0.111880 | 0.039641 | -0.136326 | -0.040928 | 0.035524 | -0.073050 | 0.019929 | 0.280654 | -0.007411 | plato |
| 4 | -0.006164 | 0.119604 | 0.419397 | -0.055062 | 0.054944 | 0.235722 | -0.163807 | 0.224500 | -0.135538 | -0.152318 | ... | -0.102309 | 0.186173 | -0.208006 | -0.142885 | -0.150585 | -0.245080 | -0.196094 | 0.155825 | 0.348381 | plato |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 360803 | 0.211590 | -0.087330 | 0.088609 | 0.016126 | -0.045331 | 0.090883 | 0.065111 | 0.110797 | 0.266033 | 0.121365 | ... | 0.064363 | 0.004042 | -0.200736 | 0.037536 | 0.026489 | -0.096421 | 0.036099 | 0.016351 | 0.245053 | feminism |
| 360804 | 0.113471 | -0.216399 | 0.175970 | 0.051783 | -0.054340 | -0.087218 | -0.048680 | 0.112736 | 0.281851 | 0.084941 | ... | 0.054292 | 0.129044 | -0.234983 | 0.187192 | 0.107946 | -0.069528 | -0.090186 | 0.086337 | -0.030632 | feminism |
| 360805 | 0.144589 | 0.044592 | 0.160965 | -0.152926 | -0.032317 | 0.044778 | 0.018068 | 0.124104 | 0.265405 | 0.084099 | ... | 0.144931 | 0.063982 | -0.326962 | 0.243828 | 0.096736 | 0.032914 | -0.174537 | 0.000941 | 0.092422 | feminism |
| 360806 | 0.106469 | -0.040506 | 0.156078 | -0.060295 | -0.051348 | -0.002057 | -0.015109 | 0.211921 | 0.196765 | -0.001303 | ... | 0.140586 | 0.016712 | -0.179075 | 0.021467 | 0.074203 | -0.020293 | -0.134596 | 0.086264 | 0.057831 | feminism |
| 360807 | 0.195210 | -0.015442 | 0.062635 | 0.007586 | -0.050702 | -0.025424 | -0.047338 | 0.116275 | 0.308264 | 0.176540 | ... | 0.127512 | -0.059181 | -0.201298 | 0.050745 | 0.162582 | -0.090067 | 0.043041 | -0.029861 | 0.016354 | feminism |
360282 rows × 101 columns
import umap.umap_ as umap
# For clarity (and speed) we visualize a 20k-row sample instead of all 360k.
# FIX: seed the sample too — without random_state here, the random_state on
# UMAP below does not make the figure reproducible end-to-end.
df_sentence_vec_sample = df_sentence_vec.sample(20000, random_state=2023)
# run UMAP to project the 100-dim sentence vectors down to 2-D
dim = umap.UMAP(random_state = 2023)
t1 = time.time()
sentence_vec_umap = dim.fit_transform(df_sentence_vec_sample.iloc[:, 0:100])
t2 = time.time()
print('Time Cost: ', np.round(t2 - t1))
Time Cost: 15.0
# Wrap the 2-D UMAP coordinates in a DataFrame and re-attach the school
# label from the sampled rows.
sentence_vec_umap = pd.DataFrame(sentence_vec_umap, columns=['x', 'y'])
sentence_vec_umap['school'] = list(df_sentence_vec_sample['school'])
sentence_vec_umap.head(3)
| x | y | school | |
|---|---|---|---|
| 0 | -1.434260 | 0.756006 | aristotle |
| 1 | 1.762848 | 3.543815 | aristotle |
| 2 | -3.283007 | 0.683291 | continental |
Observe that sentences from the same school cluster together. Some schools are more distinct from the others, such as plato, analytic, aristotle and german idealism.
# UMAP scatter of the sampled sentences, one color per school.
plt.figure(figsize=(12, 12))
sns.scatterplot(data=sentence_vec_umap, x='x', y='y',
                hue='school', palette='bright',
                alpha=0.5, s=15)
plt.legend(title = 'school', loc='upper right')
plt.title('UMAP Visualization')
plt.grid()
plt.show()
# train/test split (70% train / 30% validation)
# FIX: random.seed() has no effect on H2O (the split happens in the JVM);
# pass the seed directly to split_frame so the split is reproducible.
percent = 0.7
data_split = sentence_vec.split_frame(ratios=[percent], seed=666)
print("Train Data: ", data_split[0].shape,"\nValidation Data: ", data_split[1].shape)
Train Data: (252242, 101) Validation Data: (108566, 101)
# Gradient Boosting multi-class classifier: predict 'school' from the
# 100-dimensional averaged sentence vectors, with 5-fold cross-validation.
n_CV = 5  # number of cross validations
gbm_model = H2OGradientBoostingEstimator(
    ntrees=200,
    max_depth=4,
    col_sample_rate=0.5,
    min_rows=10,
    nfolds=n_CV,
    seed=666,
)
# Predictor columns are the first 100 (the vector components C1..C100).
X = sentence_vec.columns[0:100]
# train model, timing the run
t1 = time.time()
gbm_model.train(x=X, y='school',
                training_frame=data_split[0],
                validation_frame=data_split[1])
t2 = time.time()
print('Elapsed time:', np.round(t2-t1,2), 'secs')
gbm Model Build progress: |██████████████Job request failed Unexpected HTTP error: ('Connection aborted.', BadStatusLine('GET /3/Jobs/$03017f00000132d4ffffffff$_9b7f6d015aff324d807d4fb8b0a54223 HTTP/1.1\r\n')), will retry after 3s.
████████████████████████████████████████| (done) 100%
Elapsed time: 1507.95 secs
# summary of the model
# Per-fold and aggregate cross-validation metrics (accuracy, logloss,
# per-class error, ...).
gbm_model.cross_validation_metrics_summary()
| mean | sd | cv_1_valid | cv_2_valid | cv_3_valid | cv_4_valid | cv_5_valid | |
|---|---|---|---|---|---|---|---|
| accuracy | 0.7099262 | 0.0008999 | 0.7095707 | 0.7113813 | 0.7092822 | 0.70921 | 0.7101867 |
| auc | nan | 0.0 | nan | nan | nan | nan | nan |
| err | 0.2900738 | 0.0008999 | 0.2904293 | 0.2886187 | 0.2907179 | 0.29079 | 0.2898133 |
| err_count | 14633.8 | 72.63401 | 14660.0 | 14551.0 | 14733.0 | 14650.0 | 14575.0 |
| logloss | 0.9295393 | 0.0047388 | 0.9315212 | 0.9254441 | 0.9367675 | 0.9283601 | 0.9256037 |
| max_per_class_error | 0.5847426 | 0.0284677 | 0.5532544 | 0.5734870 | 0.6187845 | 0.5675676 | 0.6106195 |
| mean_per_class_accuracy | 0.6740539 | 0.0022980 | 0.6761600 | 0.6757663 | 0.6704702 | 0.674602 | 0.6732712 |
| mean_per_class_error | 0.3259461 | 0.0022980 | 0.3238400 | 0.3242337 | 0.3295298 | 0.3253981 | 0.3267288 |
| mse | 0.2888025 | 0.0010639 | 0.2894726 | 0.2880128 | 0.2903351 | 0.2878637 | 0.2883281 |
| pr_auc | nan | 0.0 | nan | nan | nan | nan | nan |
| r2 | 0.9796402 | 0.0000709 | 0.9795985 | 0.9797645 | 0.9796085 | 0.9795974 | 0.9796324 |
| rmse | 0.5374028 | 0.0009894 | 0.5380266 | 0.5366682 | 0.5388275 | 0.5365293 | 0.5369619 |
# predictions on validation set
# 'predict' holds the argmax class; the remaining columns are the
# per-class probabilities.
pred = gbm_model.predict(data_split[1][X])
pred = pred.as_data_frame()
pred.head()
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
| predict | analytic | aristotle | capitalism | communism | continental | empiricism | feminism | german_idealism | nietzsche | phenomenology | plato | rationalism | stoicism | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | plato | 0.000836 | 0.066826 | 0.000696 | 0.000641 | 0.000280 | 0.001766 | 0.000299 | 0.000655 | 0.000668 | 0.000428 | 0.923840 | 0.002860 | 0.000205 |
| 1 | plato | 0.012185 | 0.000793 | 0.000073 | 0.000130 | 0.000225 | 0.001025 | 0.000463 | 0.000849 | 0.000498 | 0.000642 | 0.981836 | 0.001228 | 0.000054 |
| 2 | aristotle | 0.017908 | 0.681998 | 0.001687 | 0.004387 | 0.003960 | 0.002530 | 0.008268 | 0.006916 | 0.009270 | 0.003417 | 0.245634 | 0.010416 | 0.003610 |
| 3 | aristotle | 0.028480 | 0.558274 | 0.012511 | 0.010945 | 0.022826 | 0.032495 | 0.014772 | 0.014947 | 0.023823 | 0.011571 | 0.209312 | 0.056392 | 0.003652 |
| 4 | plato | 0.002624 | 0.030288 | 0.000823 | 0.000786 | 0.001885 | 0.002024 | 0.002972 | 0.003569 | 0.003340 | 0.001435 | 0.937941 | 0.008788 | 0.003525 |
# extract the actual value from the data frame
# compare values with the predicted values
y = data_split[1]['school'].as_data_frame()
actuals = y['school']
actuals
# Element-wise match of predicted vs actual labels -> overall accuracy %.
res = pred.predict == actuals
acc = round(res.sum()/len(pred)*100,2)
# FIX: corrected "Accurarcy" typo in the printed message
print("Accuracy rate of GBM model is: ", acc, "%")
Accurarcy rate of GBM model is: 70.83 %
# Confusion matrix: rows = predicted school, columns = actual school.
conf = pd.crosstab(pred.predict, actuals)
conf
# Heat-map view of the confusion matrix; fmt='g' keeps counts as integers.
plt.figure(figsize=(12, 8))
sns.heatmap(conf, annot=True, fmt='g', cmap='Reds')
plt.show()
# compute the per-school accuracy: confusion-matrix diagonal (correct
# predictions per class) divided by the actual count of each school
all_count = y.value_counts().sort_index()
diag = np.diag(conf)
percent_acc = diag/all_count*100
# print out values
# FIX: corrected "Accurarcy" typo in the printed header
print('Accuracy Rate:\n')
for index, value in percent_acc.items():
    # value_counts() on the one-column frame yields tuple keys; join to a string
    index_str = ','.join(index)
    print(f"{index_str}: {round(value,2)}%")
Accurarcy Rate: analytic: 76.96% aristotle: 75.51% capitalism: 79.83% communism: 70.42% continental: 70.05% empiricism: 66.16% feminism: 65.27% german_idealism: 77.41% nietzsche: 56.42% phenomenology: 64.88% plato: 67.98% rationalism: 61.14% stoicism: 41.21%
# import packages
from nltk.sentiment import SentimentIntensityAnalyzer
# Ensure the VADER lexicon is present (no-op if already downloaded).
nltk.download('vader_lexicon')
# Rule-based sentiment scorer; returns neg/neu/pos/compound per text.
sia = SentimentIntensityAnalyzer()
[nltk_data] Downloading package vader_lexicon to [nltk_data] /Users/wanghan/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
# Work on an explicit copy so adding columns below does not trigger pandas'
# SettingWithCopyWarning (assigning into a slice view of df), which the
# original code raised.
sentence_ = df[['sentence_lowered','school']].copy()
# VADER polarity scores: a dict with 'neg'/'neu'/'pos'/'compound' per sentence.
sentence_['sentiment_scores'] = df['sentence_lowered'].apply(lambda x: sia.polarity_scores(x))
/var/folders/bh/x8nxlsvd33l8xvwp89lqfxjw0000gn/T/ipykernel_58712/2626219395.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy sentence_['sentiment_scores'] = df['sentence_lowered'].apply(lambda x: sia.polarity_scores(x))
def get_sentiment_label(score):
    """Map a VADER compound score to a sentiment label.

    Uses the conventional +/-0.05 cutoffs: returns "positive" for
    score > 0.05, "negative" for score < -0.05, and "neutral" otherwise.
    (Fixes the "postive" spelling in the original label.)
    """
    if score > 0.05:
        return "positive"
    elif score < -0.05:
        return "negative"
    else:
        return "neutral"
sentence_['sentiment_label'] = sentence_['sentiment_scores'].apply(lambda x: get_sentiment_label(x['compound']))
/var/folders/bh/x8nxlsvd33l8xvwp89lqfxjw0000gn/T/ipykernel_58712/253699721.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy sentence_['sentiment_label'] = sentence_['sentiment_scores'].apply(lambda x: get_sentiment_label(x['compound']))
# Count sentiment labels within each school; value_counts() sorts the
# labels most-common-first inside each group.
sentence_sentiment = sentence_[['school', 'sentiment_label']]
sentence_sentiment.groupby('school').value_counts()
school sentiment_label
analytic neutral 24846
postive 19325
negative 11254
aristotle postive 19113
neutral 18472
negative 11194
capitalism postive 10199
neutral 4406
negative 3589
communism postive 7386
neutral 6665
negative 3907
continental postive 11341
neutral 11246
negative 11192
empiricism postive 10014
neutral 5363
negative 4554
feminism postive 8446
negative 5955
neutral 4234
german_idealism postive 18896
neutral 15289
negative 7951
nietzsche postive 5759
negative 4052
neutral 3737
phenomenology neutral 12818
postive 10268
negative 5487
plato postive 17442
neutral 12778
negative 8146
rationalism postive 11627
neutral 5928
negative 5394
stoicism postive 1103
neutral 749
negative 683
dtype: int64